self::HTML_NAMESPACE => [
'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
'frame' => true,
- 'plaintext' => true, 'isindex' => true, 'textarea' => true,
+ 'plaintext' => true, 'isindex' => true,
'xmp' => true, 'iframe' => true, 'noembed' => true,
'noscript' => true, 'script' => true,
'title' => true
]
];
// Elements whose serialized form gets an extra leading linefeed.
// The element flattening code tests `isA( BalanceSets::$extraLinefeedSet )`
// and, when the first character after the start tag is "\n", emits that
// linefeed twice. Keyed by namespace, then by local tag name; the `true`
// values make this a lookup set.
+ public static $extraLinefeedSet = [
+ self::HTML_NAMESPACE => [
+ 'pre' => true, 'textarea' => true, 'listing' => true,
+ ]
+ ];
+
public static $headingSet = [
self::HTML_NAMESPACE => [
'h1' => true, 'h2' => true, 'h3' => true,
}
if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
$out = "<{$this->localName}{$encAttribs}>";
+ $len = strlen( $out );
// flatten children
foreach ( $this->children as $elt ) {
$out .= "{$elt}";
}
$out .= "</{$this->localName}>";
+ if (
+ $this->isA( BalanceSets::$extraLinefeedSet ) &&
+ $out[$len] === "\n"
+ ) {
+ // Double the linefeed after pre/listing/textarea
+ // according to the HTML5 fragment serialization algorithm.
+ $out = substr( $out, 0, $len + 1 ) .
+ substr( $out, $len );
+ }
} else {
$out = "<{$this->localName}{$encAttribs} />";
Assert::invariant(
* - The document is never in "quirks mode".
* - All occurrences of < and > have been entity escaped, so we
* can parse tags by simply splitting on those two characters.
+ * (This also simplifies the handling of < inside <textarea>.)
* The character < must not appear inside comments.
* Similarly, all attributes have been "cleaned" and are double-quoted
* and escaped.
* - All null characters are assumed to have been removed.
- * - We don't alter linefeeds after <pre>/<listing>.
* - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- * <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
+ * <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
* <noembed>, <noscript>, <script>, <title>. As a result,
* further simplifications can be made:
* - `frameset-ok` is not tracked.
* - `head element pointer` is not tracked (but presumed non-null)
- * - Tokenizer has only a single mode.
+ * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
+ * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
*
* We generally mark places where we omit cases from the spec due to
* disallowed elements with a comment: `# OMITTED: <element-name>`.
private $tidyCompat;
private $allowComments;
- private $textIntegrationMode = false;
+ private $textIntegrationMode;
private $pendingTableText;
private $originalInsertionMode;
private $fragmentContext;
private $formElementPointer;
+ private $ignoreLinefeed;
+ private $inRCDATA;
+ private $inRAWTEXT;
/**
* Valid HTML5 comments.
$this->processingCallback = $processingCallback;
$this->processingArgs = $processingArgs;
+ $this->textIntegrationMode =
+ $this->ignoreLinefeed =
+ $this->inRCDATA =
+ $this->inRAWTEXT = false;
+
# The stack is constructed with an <html> element already on it.
# Set this up as a fragment parsed with <body> as the context.
$this->fragmentContext =
# Don't actually inject the empty string as a text token.
return true;
}
+ // Support pre/listing/textarea by suppressing initial linefeed
+ if ( $this->ignoreLinefeed ) {
+ $this->ignoreLinefeed = false;
+ if ( $token === 'text' ) {
+ if ( $value[0] === "\n" ) {
+ if ( $value === "\n" ) {
+ # Nothing would be left, don't inject the empty string.
+ return true;
+ }
+ $value = substr( $value, 1 );
+ }
+ }
+ }
// Some hoops we have to jump through
$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
# are stripped in the Sanitizer) but may be generated by extensions.
if (
$this->allowComments &&
+ !( $this->inRCDATA || $this->inRAWTEXT ) &&
preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
/* verify EOF condition where necessary */
( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
$slash = $t = $attribStr = $brace = $rest = null;
}
$goodtag = $t;
+ if ( $this->inRCDATA ) {
+ if ( $slash && $t === $this->inRCDATA ) {
+ $this->inRCDATA = false;
+ } else {
+ // No tags allowed; this emulates the "rcdata" tokenizer mode.
+ $goodtag = false;
+ }
+ }
+ if ( $this->inRAWTEXT ) {
+ if ( $slash && $t === $this->inRAWTEXT ) {
+ $this->inRAWTEXT = false;
+ } else {
+ // No tags allowed, no entity-escaping done.
+ $goodtag = false;
+ }
+ }
$sanitize = $this->allowedHtmlElements !== null;
if ( $sanitize ) {
$goodtag = $t && isset( $this->allowedHtmlElements[$t] );
if ( $goodtag ) {
$rest = str_replace( '>', '>', $rest );
$this->insertToken( 'text', str_replace( '>', '>', $rest ) );
+ } elseif ( $this->inRAWTEXT ) {
+ $this->insertToken( 'text', "<$x" );
} else {
# bad tag; serialize entire thing as text.
$this->insertToken( 'text', '<' . str_replace( '>', '>', $x ) );
/**
 * Insert an element whose content is treated as raw text and switch the
 * tree builder to `inTextMode`.
 *
 * Besides inserting the element on the stack, this records the element
 * name in `$this->inRAWTEXT`. The tokenizer compares that value against
 * the name of each end tag it sees and, until the matching end tag
 * arrives, treats every would-be tag as plain text.
 *
 * @param string $value Name of the element being opened (it is compared
 *   against tag names by the tokenizer, so presumably a lowercase tag
 *   name -- confirm against callers).
 * @param array|null $attribs Attributes passed through unchanged to
 *   `insertHTMLElement()`.
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
$this->stack->insertHTMLElement( $value, $attribs );
- // XXX switch tokenizer to rawtext state?
+ $this->inRAWTEXT = $value;
// Keep whatever switchMode() hands back so the mode can be restored later
// (presumably the mode that was active before the switch -- confirm).
$this->originalInsertionMode = $this->switchMode( 'inTextMode' );
return true;
}
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
- # As described in "simplifications" above:
- # 1. We don't touch the next token, even if it's a linefeed.
- # 2. OMITTED: frameset_ok
+ $this->ignoreLinefeed = true;
+ # OMITTED: frameset_ok
return true;
case 'form':
return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
# OMITTED: <isindex>
- # OMITTED: <textarea>
+
+ case 'textarea':
+ $this->stack->insertHTMLElement( $value, $attribs );
+ $this->ignoreLinefeed = true;
+ $this->inRCDATA = $value; // emulate rcdata tokenizer mode
+ # OMITTED: frameset_ok
+ return true;
+
# OMITTED: <xmp>
# OMITTED: <iframe>
# OMITTED: <noembed>